package org.luosl.webmagicx.processor

import java.util.Collections

import org.jsoup.Jsoup
import org.luosl.webmagicx.conf.{Extract, SpiderConf, TargetUrlRegex}
import us.codecraft.webmagic.{Page, Request}
import us.codecraft.webmagic.selector._

import scala.collection.JavaConverters._

/**
  * Configuration-driven page processor.
  *
  * For every page it decides whether the body is JSON or HTML, stores the
  * page url and content under the `_url` / `_page` fields, queues the links
  * matched by the configured target url rules, and extracts the configured
  * fields. A page is marked as skipped when a mandatory field is blank.
  *
  * Created by luosl on 2017/11/6.
  */
class GeneralProcessor(sc: SpiderConf) extends AbstractProcessor(sc){

  // Page-type tags; also used as stable identifiers in pattern matches below.
  private val json:String = "application/json"
  private val html:String = "text/html"

  /**
    * Extracts follow-up links from a page unless the depth limit is reached.
    *
    * Links are only extracted when `sc.attribute.maxDeep` is -1 or `deep` is
    * still below it. A page type other than the json / html tags yields no
    * links.
    *
    * @param types page type tag
    * @param content page content
    * @param deep depth of the current page
    * @return (depth for the extracted links, i.e. `deep + 1`, Seq[(priority, url)])
    */
  def extractLinks(types:String,content:AbstractSelectable,deep:Int): (Int, Seq[(Long, String)]) ={
    val links:Seq[(Long, String)] = if(sc.attribute.maxDeep == -1 || deep < sc.attribute.maxDeep){
      types match {
        case `json` => extractJsonLinks(content)
        case `html` => extractHtmlLinks(content)
        case other =>
          // Unknown page type: nothing can be extracted, but do not fail with a MatchError.
          logInfo(s"未知的页面类型[$other],跳过链接抽取环节...")
          Seq.empty[(Long, String)]
      }
    }else{
      logInfo(s"当前深度:$deep,已经大于等于指定最大深度${sc.attribute.maxDeep},跳过链接抽取环节...")
      Seq.empty[(Long, String)]
    }
    (deep + 1, links)
  }

  /**
    * Extracts links from JSON content.
    *
    * Only target rules whose extract type is "jsonPath" take part; the values
    * selected by the json path are kept when they fully match the rule's regex.
    *
    * @param content content
    * @return Seq[(priority, url)]
    */
  private def extractJsonLinks(content:AbstractSelectable): Seq[(Long, String)] ={
    val targets:Seq[TargetUrlRegex] = sc.targetUrlRegexs
    targets.flatMap{ target =>
      // Rules without an extract, or with a non-jsonPath extract, contribute nothing.
      target.extract.toSeq.filter(_.types == "jsonPath").flatMap{ jsonExtract =>
        content.jsonPath(jsonExtract.expression).all().asScala
          .filter(_.matches(target.regex))
          .map(url=> (target.priority, url))
      }
    }
  }

  /**
    * Extracts links from HTML content.
    *
    * Target rules with an "xpath" or "cssSelector" extract search the links
    * inside the selected region; rules without an extract search all links of
    * the page. Rules with any other extract type are ignored.
    *
    * @param content content
    * @return Seq[(priority, url)]
    */
  private def extractHtmlLinks(content:AbstractSelectable): Seq[(Long, String)] ={
    val targets:Seq[TargetUrlRegex] = sc.targetUrlRegexs
    targets.flatMap{ target=>
      val linkSource:Option[Selectable] = target.extract match {
        case Some(Extract("xpath", expression)) => Some(content.xpath(expression).links())
        case Some(Extract("cssSelector", expression)) => Some(content.$(expression).links())
        case None => Some(content.links())
        case _ => None
      }
      linkSource.toSeq.flatMap{ select =>
        select.regex(target.regex).all().asScala.map(url=> (target.priority, url))
      }
    }
  }

  /**
    * Processes one page: records url and content, queues target requests and
    * extracts the configured fields.
    *
    * Calls `onSkip` when a mandatory field is blank, `onSuccess` when the page
    * was not skipped, and `onError` (before rethrowing) when an exception occurs.
    *
    * @param page page
    */
  override def process(page: Page): Unit = {
    val (types, content):(String, AbstractSelectable) = contentAndType(page)
    try{
      // Drop the fragment part; takeWhile also copes with a url that is only "#".
      val currentUrl:String = page.getRequest.getUrl.takeWhile(_ != '#')
      page.putField("_url",currentUrl)
      page.putField("_page",content.toString)
      // Queue the links found in the page.
      val (deep, linkAndPriorities):(Int, Seq[(Long, String)]) = extractLinks(types, content, page.getRequest.getDeep)
      val reqs:Seq[Request] = linkAndPriorities.map{ case (priority, url) =>
        new Request().setDeep(deep).setPriority(priority).setUrl(url)
      }
      page.addTargetRequestList(reqs.asJava)
      // Extract the configured fields; once the page is skipped the remaining fields are ignored.
      sc.fields.foreach{ field=>
        if(!page.isSkip){
          // One extracted string per configured extract rule.
          val extracts:Seq[String] = field.extracts.map{ extract=>
            val textWithTag:String = extractContent(types, content, extract)
              .map(_.all().asScala.filterNot(_ == null).mkString)
              .getOrElse("")
            if(field.textFormat) Jsoup.parse(textWithTag).text() else textWithTag
          }
          // Pick the value according to the field scope. An empty rule list yields ""
          // for every scope, so the trim below can never hit a null.
          val selected:String = field.scope match {
            case "head" => extracts.headOption.getOrElse("")
            case "last" => extracts.lastOption.getOrElse("")
            case "all" => extracts.mkString
            case other => throw new RuntimeException(s"无效的xpathScope[$other]")
          }
          if(isBlank(selected) && field.must){
            logInfo(s"由于字段[name=${field.name},type=${field.extracts.map(_.types).mkString(",")},expression=${field.extracts.map(_.expression).mkString(",")}]未抽取到数据,[url=$currentUrl]被跳过...")
            page.setSkip(true)
            onSkip(page,field)
          }else{
            page.putField(field.name,selected.trim)
          }
        }
      }
      if(!page.isSkip){
        onSuccess(page)
      }
    }catch {
      case e:Exception =>
        onError(page)
        throw e
    }
  }

  /**
    * Determines page type and content from the "Content-Type" response header.
    *
    * Only the media type in front of the first ';' is compared, so headers
    * with zero, one or several parameters are handled alike. Anything that
    * is not JSON, a missing header, or a header with several values is
    * treated as HTML.
    *
    * @param page page
    * @return (page type tag, page content)
    */
  private def contentAndTypeByHead(page: Page):(String, AbstractSelectable) = {
    val contentTypeHeader:Option[List[String]] = page.getHeaders.asScala.get("Content-Type").map(_.asScala.toList)
    val mediaType:Option[String] = contentTypeHeader.collect{
      case List(value) => value.takeWhile(_ != ';').trim
    }
    if(mediaType.exists(_.equalsIgnoreCase(json))){
      (json, page.getJson)
    }else{
      (html, page.getHtml)
    }
  }

  /**
    * Determines page type and content by inspecting the raw body: a body
    * whose first non-whitespace character is '{' or '[' is treated as JSON,
    * everything else as HTML.
    *
    * @param page page
    * @return (page type tag, page content)
    */
  private def contentAndType(page: Page): (String, AbstractSelectable) ={
    // Leading whitespace is legal in front of a JSON value, so ignore it.
    val body:String = page.getRawText.trim
    if(body.startsWith("{") || body.startsWith("[")){
      (json, page.getJson)
    }else{
      (html, page.getHtml)
    }
  }

  /**
    * Applies a single extract rule to the page content.
    *
    * @param types page type tag
    * @param content page content
    * @param extract extract rule (type and expression)
    * @return the selection, or None when the rule type does not fit the page
    *         type or the selector returned null
    */
  def extractContent(types:String, content:AbstractSelectable, extract: Extract): Option[Selectable] =
    (types, extract.types) match {
      case (`json`, "jsonPath") => Option(content.jsonPath(extract.expression))
      case (`html`, "cssSelector") => Option(content.$(extract.expression))
      case (`html`, "xpath") => Option(content.xpath(extract.expression))
      case _ => None
    }

  /**
    * Extracts a field value from the page HTML with an xpath expression.
    *
    * Not referenced from within this class.
    *
    * @param xpath xpath expression
    * @param scope "get" for the first match, "all" for all matches concatenated
    * @param textFormat when true, the selected markup is reduced to its text
    * @param page page
    * @return the extracted value, or null when it is blank
    */
  private def extractFieldContent(xpath:String,scope:String,textFormat:Boolean,page: Page):String ={
    val selectable:Selectable = page.getHtml.xpath(xpath)
    val selectText:String = scope match {
      case "get" => selectable.get()
      case "all" => selectable.all().asScala.mkString
      case other => throw new RuntimeException(s"无效的scope:$other")
    }
    if(isBlank(selectText)){
      null
    }else if(textFormat){
      Jsoup.parse(selectText).text()
    }else{
      selectText
    }
  }

  /**
    * Tells whether a string is null, empty or whitespace only.
    *
    * @param str str
    * @return true when the string is blank
    */
  def isBlank(str:String):Boolean = str==null||str.trim.isEmpty


}
